######################################################################
######################################################################
####            A estrada dos tijolos amarelos:                   ####
#### Como e por quê produzir trabalhos qualitativos transparentes?####
####    Amanda Domingos, Palloma Marciano & Virginia Rocha        ####                            
####      (Corresponding author: amanda.domingos@ufpe.br)         ####                      
######################################################################
######################################################################

#Loading packages
library("rscielo")
library(stringi)
library(quanteda)
library(plyr)
library(dplyr)
library(broom)
library(tidyverse)
library(dplyr)
library(tidytext)
library(tm) 
library(class)
library(SnowballC)
library(topicmodels)
library(ggplot2)

# Load the input table; the `abstract_en` column is the text analysed below.
total <- read.csv("Data/OriginalData/total.csv")

##################
#### Abstract ####
##################

# Normalise the abstracts before tokenisation:
#   1. transliterate accented characters to plain ASCII;
#   2. delete punctuation. Characters are removed, not replaced by a space, so
#      hyphenated words are fused ("in-depth" becomes "indepth"); dictionary
#      patterns matched against this text must use the fused spelling;
#   3. lower-case everything;
#   4. collapse runs of whitespace into a single space and trim both ends.
total$abstract_en <- stri_trans_general(total$abstract_en, "Latin-ASCII")
# `[[:punct:]]` already covers the full stop, so no separate "\\." pass is
# needed.
total$abstract_en <- stri_replace_all_regex(total$abstract_en, "[[:punct:]]", "")
total$abstract_en <- stri_trans_tolower(total$abstract_en)
# The previous `stri_sub(x)` call had no bounds and returned its input
# unchanged, so extra spaces were never actually removed.
total$abstract_en <- stri_replace_all_regex(total$abstract_en, "\\s+", " ")
total$abstract_en <- stri_trim_both(total$abstract_en)

# English stopword list used to filter the token stream below.
sw <- stopwords("english")

# Tokenise the cleaned abstracts once; both objects below derive from it.
toks <- tokens(total$abstract_en)

# Document-feature matrix of the abstracts. dfm() is given a tokens object
# because calling it directly on character input is deprecated since
# quanteda 3.0.
dtm <- dfm(toks)

# Lower-case, drop stopwords, then stem.
# Stopwords are removed BEFORE stemming: the list holds unstemmed words, so
# stemming first would leave behind every stopword whose stem differs from its
# surface form. The result is assigned back; previously the output of
# tokens_remove() was discarded and `toks` kept all its stopwords.
toks <- tokens_tolower(toks)
toks <- tokens_remove(toks, sw)
toks <- tokens_wordstem(toks)


# Looking for qualitative methods in the abstracts.
#
# Each key is a method category; its values are the phrases counted for that
# category. Matching is done against `total$abstract_en`, from which
# punctuation has been deleted (hyphenated words are fused), so:
#   * patterns containing a hyphen can never match and need a fused variant
#     ("set-theory" appears in the text as "settheory");
#   * misspelt patterns only match equally misspelt abstracts, so the correct
#     spelling is listed alongside them.
# Original patterns are kept; only additional variants were added.
quali <- dictionary(list(QCA = c("qualitative comparative analysis", "qca"),
                         process = c("process tracing"),
                         partobs = c ("participant observation"),
                         focus = c("focus groups", "focus group"),
                         npartos = c("non participative observation", "non participant observation",
                                     # fused forms of "non-participative" / "non-participant"
                                     "nonparticipative observation", "nonparticipant observation"),
                         indepth = c("indepth interview", "indepth interviews"),
                         interview = c("interview", "interviews"),
                         ethnography = c("etnography", "ethnography"),
                         netnography = c("netnography"),
                         content = c("content analysis"),
                         documental = c("documental analysis", "document analysis"),
                         CMA = c("analysis of collective mindsets", "cma"),
                         # key name kept as-is (it becomes a column name);
                         # "diaries analysis" is the correctly spelt pattern
                         dairies = c("dairies analysis", "diaries analysis"),
                         life = c("life story"),
                         peergroup = c("peer group", "peer groups"),
                         conflictgroup = c("conflict group", "conflict groups"),
                         casestudy = c("case study"),
                         # "settheory" is what "set-theory" looks like after cleaning
                         settheory = c("set-theory", "set theory", "settheory"),
                         # "historial" was a typo; the correct spelling is added
                         CHA = c("comparative historial analysis", "comparative historical analysis", "cha"),
                         narrative = c("narrative analysis"),
                         Interpretivism = c("interpretivism"),
                         dataarchiving = c("data archiving"),
                         Discourseanalysis = c("Discourse analysis", "speech analysis"),
                         Rhetorical = c("rhetorical analysis"),
                         semiotic = c("semiotic analysis"),
                         grounded = c("grounded theory")))

# Inspection only: the result is not assigned. With exclusive = FALSE matched
# phrases are replaced by their category key and all other tokens are kept.
tokens(total$abstract_en) %>%
  tokens_lookup(dictionary = quali, exclusive = FALSE)

# Per-abstract counts of each qualitative category.
dfquali <- tokens(total$abstract_en) %>%
  tokens_lookup(dictionary = quali) %>%
  dfm()

# Looking for quantitative methods in the abstracts.
#
# Each key is a method category; its values are the phrases counted for that
# category. Matching is done against `total$abstract_en`, from which
# punctuation has been deleted (hyphenated words are fused and apostrophes
# dropped), so patterns that contain punctuation can never match. Their fused
# spellings are added next to them; all original patterns are kept.
quanti <- dictionary(list(experimento = c("experiment"),
                          naturalexperiment = c("natural experiment"),
                          quasiexperiment = c("quasi experiment", "quasi-experiment",
                                              "quasiexperiment"),
                          RDD = c("regression discontinuity design", "design rdd", "rdd", "regression discontinuity"),
                          DD = c("differences in differences", "diff-in-diff", "diff in diff", "(DD)", "DD", "difference in difference", "difference in difference estimator", "difference-in-differences", "difference in difference",
                                 # fused forms of the hyphenated patterns above
                                 "diffindiff", "differenceindifferences"),
                          Matching = c("propensity score matching", "PSM", "exact matching", "matching"),
                          SyntheticControl = c("synthetic control"),
                          IV = c("instrumental variables", "instrumental variable"),
                          Regression = c("regression model", "regression models", "logit",
                                         "probit", "poisson", "ols", "negative binomial", "hurdle model",
                                         "multilevel model", "multilevel regression",
                                         "hierarchical models","hierarchical regression",
                                         "Two-stage least squares", "two stage least squares",
                                         # fused form of "two-stage"
                                         "twostage least squares",
                                         "panel regression"),
                          correlation = c("correlation model", "pearson correlation", "correlation analysis"),
                          anova = c("anova", "variance analysis"),
                          # "ttest" is what "t-test" looks like after cleaning
                          testet = c("t test", "independent t test", "Student's t-test", "student's t test",
                                     "ttest"),
                          survival = c("survival analysis", "proportional hazards model", "cox regression"),
                          survey = c("survey", "survey interviews", "survey analysis", "survey experiment", "structured interview"),
                          textanalysis = c("text analysis", "text as data"),
                          cluster = c("cluster analysis")))


# Inspection only: the result is not assigned. With exclusive = FALSE matched
# phrases are replaced by their category key and all other tokens are kept.
tokens(total$abstract_en) %>%
  tokens_lookup(dictionary = quanti, exclusive = FALSE)

# Per-abstract counts of each quantitative category.
dfquanti <- tokens(total$abstract_en) %>%
  tokens_lookup(dictionary = quanti) %>%
  dfm()

# Transparency / replication vocabulary searched for in the abstracts.
# Keys are the categories; values are the phrases counted for each category.
transp <- dictionary(
  list(
    replicabilidade = c("replicability", "replication simulations"),
    reproducibilidade = c("reproducibility"),
    opendata = c("open data"),
    openscience = c("open science"),
    repositories = c(
      "data repository", "data repositories", "dataverse",
      "osf", "online repository", "qualitative data repository"
    )
  )
)

# Inspection only (result is not stored): matched phrases are swapped for their
# category key while every other token is left in place.
tokens_lookup(tokens(total$abstract_en), dictionary = transp, exclusive = FALSE)

# Document-feature matrix holding, per abstract, the count of each
# transparency category.
dftransp <- dfm(tokens_lookup(tokens(total$abstract_en), dictionary = transp))

# Generating datasets.
# For each dictionary result two views are produced: a long "tidy" table
# (freq_*) and a wide data frame with one row per abstract (dfm_*).
freq_quali <- tidy(dfquali)
freq_quanti <- tidy(dfquanti)
freq_trans <- tidy(dftransp)

dfm_quali <- convert(dfquali, to = "data.frame")
dfm_quanti <- convert(dfquanti, to = "data.frame")
dfm_transp <- convert(dftransp, to = "data.frame")

# Quick check of the converted object's class.
class(dfm_transp)

# Shared merge key: the row names of `total`, copied into every table.
# This relies on the wide tables having their rows in the same order as
# `total`, which holds because they were built from `total$abstract_en`.
total[["id"]] <- rownames(total)
dfm_quali[["id"]] <- rownames(total)
dfm_quanti[["id"]] <- rownames(total)
dfm_transp[["id"]] <- rownames(total)


# Summary variables for qualitative, quantitative and transparency vocabulary.
#
# For every abstract the new column holds the NUMBER of dictionary categories
# matched at least once (0 = no category matched). It is a count, not a 0/1
# dummy; test `> 0` to get presence/absence.

# Count, per row, the category columns with a positive value.
#   counts     : wide data frame of per-abstract category counts
#   result_col : name of the summary column about to be written; it is
#                excluded so that re-running this step does not count it
# Category columns are picked by type (the numeric ones) instead of by fixed
# position. Hard-coded ranges such as 2:27 break as soon as the number of
# categories changes, and can silently sweep in the character `id` column.
count_matched_categories <- function(counts, result_col) {
  is_count <- vapply(counts, is.numeric, logical(1))
  category_cols <- setdiff(names(counts)[is_count], result_col)
  rowSums(counts[category_cols] > 0)
}

dfm_quali[, "quali"] <- count_matched_categories(dfm_quali, "quali")
dfm_quanti[, "quanti"] <- count_matched_categories(dfm_quanti, "quanti")
dfm_transp[, "transp"] <- count_matched_categories(dfm_transp, "transp")


# Merging the three count tables into `total`.

# Left-join `y` onto `x` using the explicit key `id`.
# Without `by`, join() keys on every column name the two tables share, so an
# accidental name collision would silently change the merge. Here:
#   * `doc_id` is expected to repeat across the count tables and is only
#     carried over once;
#   * any other shared column is treated as an error rather than guessed at.
merge_counts <- function(x, y) {
  clash <- setdiff(intersect(names(x), names(y)), c("id", "doc_id"))
  if (length(clash) > 0) {
    stop("Column(s) present in both tables: ",
         paste(clash, collapse = ", "), call. = FALSE)
  }
  new_cols <- setdiff(names(y), names(x))
  join(x, y[c("id", new_cols)], by = "id")
}

total <- merge_counts(total, dfm_quali)
total <- merge_counts(total, dfm_quanti)
total <- merge_counts(total, dfm_transp)


# Saving the merged dataset and the three long-format frequency tables.
# The output folder is created first if needed, so a fresh checkout does not
# fail at the very last step; it is a no-op when the folder already exists.
output_dir <- "Data/AnalysisData"
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)

write.csv(total, file = file.path(output_dir, "total.csv"))
write.csv(freq_quali, file = file.path(output_dir, "quali.csv"))
write.csv(freq_quanti, file = file.path(output_dir, "quanti.csv"))
write.csv(freq_trans, file = file.path(output_dir, "transp.csv"))
